VARIABLE SELECTION

df <- read_excel("df_definitivo-2.xlsx", sheet = "Copia de datos")
rownames(df) = df$id
## Warning: Setting row names on a tibble is deprecated.
df$Afectacion_ganglionar = as.numeric(df$Afectacion_ganglionar)
df$Afectacion_metastasica = as.numeric(df$Afectacion_metastasica)
df$Tamaño_tumor = as.numeric(df$Tamaño_tumor)
df$Grado_tox = as.numeric(df$Grado_tox)
df$SII_pre = log(df$SII_pre)
df$SII_1C = log(df$SII_1C)
df$SII_2C = log(df$SII_2C)
df$SII_1eval = log(df$SII_1eval)

variables_inutiles = c("Anciano", "Peso", "Talla", "SG", "SG_cens", "SLP", "SLP_cens", "Tipo_tox", "NLR2C_corte4o5")
df2 = select(df, -all_of(variables_inutiles))

PLS

SPECIFIC VARIABLES FIRST EVAL

#colnames(df2)
X1.1 = df2[,2:49]
X1.2 = df2[,61:67]
X = bind_cols(X1.1, X1.2)
Y = as.matrix(df2['pri_eval_num_ok'])

SPECIFIC VARIABLES BEST RESPONSE

#colnames(df2)
X2 = df2[,2:67]
Y2 = as.matrix(df2["mejor_resp_num_ok"])

SCALING

mypls = opls(x = X, y = Y, predI = NA, crossvalI = nrow(X), scaleC = "standard", fig.pdfC = "none")
## PLS
## 34 samples x 55 variables and 1 response
## standard scaling of predictors and response(s)
##       R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y  pQ2
## Total    0.147    0.526   0.295 0.489   1   0 0.15 0.05

NUMBER OF COMPONENTS SELECTION

mypls_2 = opls(x = X, y = Y, predI = 33, crossvalI = nrow(X), scaleC = "standard", fig.pdfC = "none")
## PLS
## 34 samples x 55 variables and 1 response
## standard scaling of predictors and response(s)
##       R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y  pQ2
## Total        1        1       1   Inf  33   0 1.05 1.05
plot(1:33, mypls_2@modelDF$`R2Y(cum)`, type = "o", pch = 16, col = "blue3",
     lwd = 2, xlab = "Components", ylab = "",
     main = "PLS model for predicting First Evaluation", ylim = c(-1,1))
lines(1:33, mypls_2@modelDF$`Q2(cum)`, type = "o", pch = 16, col = "red3",
      lwd = 2)
abline(h = 0, col = "red3", lty = 2)
legend("bottomleft", c("R2Y", "Q2"), lwd = 2, 
       col = c("blue3", "red3"), bty = "n")

mypls = opls(x = X, y = Y, predI = 2, crossvalI = nrow(X), scaleC = "standard", fig.pdfC = "none")
## PLS
## 34 samples x 55 variables and 1 response
## standard scaling of predictors and response(s)
##       R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y  pQ2
## Total    0.242      0.7   0.178 0.395   2   0  0.1 0.05

EXTREME INDIVIDUALS STUDY

Para estudiar los datos anómalos haremos uso de la T2 de Hotelling y la Suma de Cuadrados Residual.

En el PCA observabamos que el individuo 11 es un dato atipico extremo. Pero igualmente lo representaremos en el gráfico de la T2 de hotelling.

misScores = mypls@scoreMN
varT = apply(misScores, 2, var)
miT2 = colSums(t(misScores**2) / varT)
N = nrow(X)
A = 2
F95 = A*(N**2 - 1)/(N*(N - A)) * qf(0.95, A, N-A); F95
## [1] 6.994835
F99 = A*(N**2 - 1)/(N*(N - A)) * qf(0.99, A, N-A); F99
## [1] 11.32992
plot(1:length(miT2), miT2, type = "l", xlab = "pacientes", ylab = "T2",
     main = "PLS: T2-Hotelling", ylim = c(0,15))
abline(h = F95, col = "orange", lty = 2, lwd = 2)
abline(h = F99, col = "red3", lty = 2, lwd = 2)

myT = mypls@scoreMN
myP = mypls@loadingMN
myE = scale(X) - myT%*%t(myP) 
mySCR = rowSums(myE^2)   # SPE 
plot(1:length(mySCR), mySCR, type = "l", main = "PLS: SCR Distance to the model", 
     ylab = "SCR", xlab = "pacientes", ylim = c(0,100))
g = var(mySCR)/(2*mean(mySCR))
h = (2*mean(mySCR)^2)/var(mySCR)
chi2lim = g*qchisq(0.95, df = h)
abline(h = chi2lim, col = "orange", lty = 2)
chi2lim99 = g*qchisq(0.99, df = h)
abline(h = chi2lim99, col = "red3", lty = 2)

GRPAHICS

plot(x = mypls, typeVc = "x-score", parCompVi = c(1, 2), parLabVc = rownames(X), parPaletteVc = NA, parTitleL = TRUE, parCexMetricN = NA)

plot(x = mypls, typeVc = "x-loading",
     parCexN = 0.8, parCompVi = c(1, 2), parPaletteVc = NA,
     parTitleL = TRUE, parCexMetricN = NA)

plot(x = mypls, typeVc = "xy-weight",
     parCexN = 0.5, parCompVi = c(1, 2), parPaletteVc = NA, 
     parTitleL = TRUE, parCexMetricN = NA)

VIP = data.frame(sort(mypls@vipVn))
bottom_10 <- data.frame(variables = rownames(VIP)[1:10], vip = VIP[1:10, ])
top_10 <- data.frame(variables = rownames(VIP)[46:55], vip = VIP[46:55, ])

grafico_barras <- ggplot(top_10, aes(x = reorder(variables, vip), y = vip)) + geom_bar(stat = "identity", fill = "skyblue") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  labs(x = "Variable", y = "VIP", title = "Top 10 VIP variables")

grafico_barras2 <- ggplot(bottom_10, aes(x = reorder(variables, vip), y = vip)) + geom_bar(stat = "identity", fill = "skyblue") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  labs(x = "Variable", y = "VIP", title = "Bottom 10 VIP variables")

print(grafico_barras)

print(grafico_barras2)

EVALUATION METRICS FOR REGRESSION

Ypred = predict(mypls)
residuos = Y-Ypred
myRMSE = sqrt(colMeans(residuos^2))
CVrmse = myRMSE/colMeans(Y)
myRMSE 
## pri_eval_num_ok 
##       0.3771181
CVrmse
## pri_eval_num_ok 
##       0.2137002
for (i in 1:ncol(Y)) {
  plot(Y[,i], Ypred[,i], asp = 1, main = colnames(Y)[i],
     xlab = "observado", ylab = "predicho")
abline(a=0, b=1, col = "red3", lwd = 2, lty = 2)
}

SCALING

mypls2 = opls(x = X2, y = Y2, predI = NA, crossvalI = nrow(X), scaleC = "standard", fig.pdfC = "none")
## PLS
## 34 samples x 66 variables and 1 response
## standard scaling of predictors and response(s)
##       R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y  pQ2
## Total    0.172    0.595   0.416 0.593   1   0 0.05 0.05
mypls_2.2 = opls(x = X2, y = Y2, predI = 33, crossvalI = nrow(X), scaleC = "standard", fig.pdfC = "none")
## PLS
## 34 samples x 66 variables and 1 response
## standard scaling of predictors and response(s)
##       R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y  pQ2
## Total        1        1       1   Inf  33   0 1.05 1.05

NUMBER OF COMPONENTS SELECTION

plot(1:33, mypls_2.2@modelDF$`R2Y(cum)`, type = "o", pch = 16, col = "blue3",
     lwd = 2, xlab = "Components", ylab = "",
     main = "PLS model for predicting Best Response", ylim = c(-1,1))
lines(1:33, mypls_2.2@modelDF$`Q2(cum)`, type = "o", pch = 16, col = "red3",
      lwd = 2)
abline(h = 0, col = "red3", lty = 2)
legend("bottomleft", c("R2Y", "Q2"), lwd = 2, 
       col = c("blue3", "red3"), bty = "n")

mypls_2 = opls(x = X2, y = Y2, predI = 4, crossvalI = nrow(X), scaleC = "standard", fig.pdfC = "none")
## PLS
## 34 samples x 66 variables and 1 response
## standard scaling of predictors and response(s)
##       R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y  pQ2
## Total    0.475    0.915   0.381 0.286   4   0 0.05 0.05

EXTREME INDIVIDUALS STUDY

Para estudiar los datos anómalos haremos uso de la T2 de Hotelling y la Suma de Cuadrados Residual.

En el PCA observabamos que el individuo 11 es un dato atipico extremo. Pero igualmente lo representaremos en el gráfico de la T2 de hotelling.

misScores = mypls_2@scoreMN
varT = apply(misScores, 2, var)
miT2 = colSums(t(misScores**2) / varT)
N = nrow(X2)
A = 2
F95 = A*(N**2 - 1)/(N*(N - A)) * qf(0.95, A, N-A); F95
## [1] 6.994835
F99 = A*(N**2 - 1)/(N*(N - A)) * qf(0.99, A, N-A); F99
## [1] 11.32992
plot(1:length(miT2), miT2, type = "l", xlab = "pacientes", ylab = "T2",
     main = "PLS: T2-Hotelling", ylim = c(0,15))
abline(h = F95, col = "orange", lty = 2, lwd = 2)
abline(h = F99, col = "red3", lty = 2, lwd = 2)

El individuo 11 sigue saliendo como dato extremo, aun realizando transformaciones logarítmicas.

myT = mypls_2@scoreMN
myP = mypls_2@loadingMN
myE = scale(X2) - myT%*%t(myP) 
mySCR = rowSums(myE^2)   # SPE 
plot(1:length(mySCR), mySCR, type = "l", main = "PLS: SCR Distance to the model", 
     ylab = "SCR", xlab = "pacientes", ylim = c(0,100))
g = var(mySCR)/(2*mean(mySCR))
h = (2*mean(mySCR)^2)/var(mySCR)
chi2lim = g*qchisq(0.95, df = h)
abline(h = chi2lim, col = "orange", lty = 2)
chi2lim99 = g*qchisq(0.99, df = h)
abline(h = chi2lim99, col = "red3", lty = 2)

GRAPHICS

plot(x = mypls_2, typeVc = "x-score", parCompVi = c(1, 2), parLabVc = rownames(X), parPaletteVc = NA, parTitleL = TRUE, parCexMetricN = NA)

plot(x = mypls_2, typeVc = "x-score", parCompVi = c(3, 4), parLabVc = rownames(X), parPaletteVc = NA, parTitleL = TRUE, parCexMetricN = NA)

plot(x = mypls_2, typeVc = "x-score", parCompVi = c(1,3), parLabVc = rownames(X), parPaletteVc = NA, parTitleL = TRUE, parCexMetricN = NA)

plot(x = mypls_2, typeVc = "x-score", parCompVi = c(1, 4), parLabVc = rownames(X), parPaletteVc = NA, parTitleL = TRUE, parCexMetricN = NA)

plot(x = mypls_2, typeVc = "x-loading",
     parCexN = 0.8, parCompVi = c(1, 2), parPaletteVc = NA,
     parTitleL = TRUE, parCexMetricN = NA)

plot(x = mypls_2, typeVc = "xy-weight",
     parCexN = 0.5, parCompVi = c(1, 2), parPaletteVc = NA, 
     parTitleL = TRUE, parCexMetricN = NA)

plot(x = mypls_2, typeVc = "xy-weight",
     parCexN = 0.5, parCompVi = c(3, 4), parPaletteVc = NA, 
     parTitleL = TRUE, parCexMetricN = NA)

VIP2 = data.frame(sort(mypls_2@vipVn))
bottom_10 <- data.frame(variables = rownames(VIP2)[1:10], vip = VIP[1:10, ])
top_10 <- data.frame(variables = rownames(VIP2)[46:55], vip = VIP[46:55, ])

grafico_barras <- ggplot(top_10, aes(x = reorder(variables, vip), y = vip)) + geom_bar(stat = "identity", fill = "skyblue") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  labs(x = "Variable", y = "VIP", title = "Top 10 VIP variables")

grafico_barras2 <- ggplot(bottom_10, aes(x = reorder(variables, vip), y = vip)) + geom_bar(stat = "identity", fill = "skyblue") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  labs(x = "Variable", y = "VIP", title = "Bottom 10 VIP variables")

print(grafico_barras)

print(grafico_barras2)

# EVALUATION METRICS FOR REGRESSION

Ypred2 = predict(mypls_2)
residuos2 = Y2-Ypred2
myRMSE2 = sqrt(colMeans(residuos2^2))
CVrmse2 = myRMSE2/colMeans(Y2)
myRMSE2 
## mejor_resp_num_ok 
##         0.2638897
CVrmse2
## mejor_resp_num_ok 
##         0.1950489

El RMSE expresa la diferencia entre los valores predichos por nuestro modelo y los observados. Es decir, en promedio el valor difiere +/- 0.2638 del valor real. Es normal porque los valores que se observan son números enteros (1, 2 y 3) y los valores que se predicen son números reales entre el 1 y el 3.

El CVrmse que obtenemos es de 0.195 lo que significa que las predicciones del modelo tienen un error del 19.5% en relación con la escala de los datos originales. Con este valor podriamos decir que el modelo tiene un rendimiento moderado-aceptable.

for (i in 1:ncol(Y)) {
  plot(Y2[,i], Ypred2[,i], asp = 1, main = colnames(Y2)[i],
     xlab = "observado", ylab = "predicho")
abline(a=0, b=1, col = "red3", lwd = 2, lty = 2)
}

PLS-DA

SPECIFIC VARIABLES FIRST EVAL

X1.1 = df2[,2:49]
X1.2 = df2[,61:67]
X = bind_cols(X1.1, X1.2)
Y = df2$pri_eval_num_ok
Y = as.matrix(as.factor(Y))

SPECIFIC VARIABLES BEST RESPONSE

X2 = df2[,2:67]
Y2 = df2$mejor_resp_num_ok
Y2 = as.matrix(as.factor(Y2))

SCALING

myplsda = opls(x = X, y = Y, predI = 33, crossvalI = nrow(X), scaleC = "standard", fig.pdfC = "none")
## PLS-DA
## 34 samples x 55 variables and 1 response
## standard scaling of predictors and response(s)
##       R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y  pQ2
## Total        1        1       1   Inf  33   0 1.05 1.05

NUMBER OF COMPONENTS SELECTION

plot(1:33, myplsda@modelDF$`R2Y(cum)`, type = "o", pch = 16, col = "blue3",
     lwd = 2, xlab = "Components", ylab = "", ylim = c(-0.5, 1),
     main = "PLS-DA model for predicting First Evaluation")
lines(1:33, myplsda@modelDF$`Q2(cum)`, type = "o", pch = 16, col = "red3",
      lwd = 2)
abline(h = 0, col = "red3", lty = 2)
legend("bottomleft", c("R2Y", "Q2"), lwd = 2, 
       col = c("blue3", "red3"), bty = "n")

myplsda = opls(x = X, y = Y, predI = 2, crossvalI = nrow(X), scaleC = "standard", fig.pdfC = "none")
## PLS-DA
## 34 samples x 55 variables and 1 response
## standard scaling of predictors and response(s)
##       R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y  pQ2
## Total     0.25    0.442   -0.14 0.361   2   0  0.4 0.25

EXTREME INDIVIDUALS STUDY

misScores = myplsda@scoreMN
varT = apply(misScores, 2, var)
miT2 = colSums(t(misScores**2) / varT)
N = nrow(X)
A = 3
F95 = A*(N**2 - 1)/(N*(N - A)) * qf(0.95, A, N-A);
F99 = A*(N**2 - 1)/(N*(N - A)) * qf(0.99, A, N-A);

plot(1:length(miT2), miT2, type = "l", xlab = "Patients", ylab = "T2",
     main = "PLS: T2-Hotelling")
abline(h = F95, col = "orange", lty = 2, lwd = 2)
abline(h = F99, col = "red3", lty = 2, lwd = 2)

myT = myplsda@scoreMN
myP = myplsda@loadingMN
myE = scale(X) - myT%*%t(myP) 
mySCR = rowSums(myE^2)   # SPE 
plot(1:length(mySCR), mySCR, type = "l", main = "SCR", 
     xlab = "Patients")

g = var(mySCR)/(2*mean(mySCR))
h = (2*mean(mySCR)^2)/var(mySCR)
chi2lim = g*qchisq(0.95, df = h)
abline(h = chi2lim, col = "orange", lty = 2)
chi2lim99 = g*qchisq(0.99, df = h)
abline(h = chi2lim99, col = "red3", lty = 2)

GRAPHICS

plot(x = myplsda, typeVc = "x-score",
     parCexN = 0.5, parCompVi = c(1, 2), parPaletteVc = NA,
     parTitleL = TRUE, parCexMetricN = NA)

plot(x = myplsda, typeVc = "xy-weight",
     parCexN = 0.5, parCompVi = c(1, 2), parPaletteVc = NA, 
     parTitleL = TRUE, parCexMetricN = NA)

VIP = data.frame(sort(myplsda@vipVn))
bottom_10 <- data.frame(variables = rownames(VIP)[1:10], vip = VIP[1:10, ])
top_10 <- data.frame(variables = rownames(VIP)[46:55], vip = VIP[46:55, ])

grafico_barras <- ggplot(top_10, aes(x = reorder(variables, vip), y = vip)) + geom_bar(stat = "identity", fill = "skyblue") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  labs(x = "Variable", y = "VIP", title = "Top 10 VIP variables")

grafico_barras2 <- ggplot(bottom_10, aes(x = reorder(variables, vip), y = vip)) + geom_bar(stat = "identity", fill = "skyblue") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  labs(x = "Variable", y = "VIP", title = "Bottom 10 VIP variables")

print(grafico_barras)

print(grafico_barras2)

EVALUATION METRICS FOR CLASSIFICATION

myplsda_pred = predict(myplsda)
actual = factor(df2$pri_eval_num_ok)
myplsda_pred <- factor(myplsda_pred, levels = levels(actual))
confusionMatrix(myplsda_pred, actual, positive = "3")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3
##          1 11  4  0
##          2  2 11  0
##          3  0  1  5
## 
## Overall Statistics
##                                         
##                Accuracy : 0.7941        
##                  95% CI : (0.621, 0.913)
##     No Information Rate : 0.4706        
##     P-Value [Acc > NIR] : 0.0001154     
##                                         
##                   Kappa : 0.6708        
##                                         
##  Mcnemar's Test P-Value : NA            
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            0.8462   0.6875   1.0000
## Specificity            0.8095   0.8889   0.9655
## Pos Pred Value         0.7333   0.8462   0.8333
## Neg Pred Value         0.8947   0.7619   1.0000
## Prevalence             0.3824   0.4706   0.1471
## Detection Rate         0.3235   0.3235   0.1471
## Detection Prevalence   0.4412   0.3824   0.1765
## Balanced Accuracy      0.8278   0.7882   0.9828

Elegimos la clase 3 como la positiva porque al creemos que es más importante detectar los pacientes que están teniendo una progresión de la enfermedad que los que la enfermedad está progresando o al menos estable. Podemos ver que el modelo si que encuentra aquellos que están empeorando (clase 3).

El índice de kappa nos indica que las predicciones están cerca de haberse realizado por casualidad. Podriamos decir que esto se debe a la clase 2 sobre todo donde se encuentran pacientes con una ligera pseudoprogresión y la enfermedad estable, es decir, que están entre la clase 1 y 3 tres totalmente.

Sin embargo, si miramos el valor de balanced accuracy nos sugiere que el modelo no está haciendo tan malas predicciones y que al menos si nos centramos en predecir los que son de la clase 3 lo está haciendo correctamente.

De este modelo podemos extraer como conclusión que quizas habría que redefinir las clases propuestas ya que podrían haber más o menos clases que nos ayudarían a separar mejor los pacientes con este PLS.

SCALING

myplsda2 = opls(x = X2, y = Y2, predI = NA, crossvalI = nrow(X), scaleC = "standard", fig.pdfC = "none")
## PLS-DA
## 34 samples x 66 variables and 1 response
## standard scaling of predictors and response(s)
##       R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y  pQ2
## Total    0.188    0.203  0.0777 0.376   1   0 0.05 0.05

NUMBER OF COMPONENTS SELECTION

myplsda2 = opls(x = X2, y = Y2, predI = 33, crossvalI = nrow(X), scaleC = "standard", fig.pdfC = "none")
## PLS-DA
## 34 samples x 66 variables and 1 response
## standard scaling of predictors and response(s)
##       R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y  pQ2
## Total        1        1       1   Inf  33   0 1.05 1.05
plot(1:33, myplsda2@modelDF$`R2Y(cum)`, type = "o", pch = 16, col = "blue3", lwd = 2, xlab = "Components", ylab = "", main = "PLS-DA model for predicting Best Response", , ylim = c(-0.5, 1))

lines(1:33, myplsda2@modelDF$`Q2(cum)`, type = "o", pch = 16, col = "red3", lwd = 2)
abline(h = 0, col = "red3", lty = 2)
legend("bottomleft", c("R2Y", "Q2"), lwd = 2, 
       col = c("blue3", "red3"), bty = "n")

myplsda2 = opls(x = X2, y = Y2, predI = 2, crossvalI = nrow(X), scaleC = "standard", fig.pdfC = "none")
## PLS-DA
## 34 samples x 66 variables and 1 response
## standard scaling of predictors and response(s)
##       R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y  pQ2
## Total    0.327    0.354   0.115 0.347   2   0 0.05 0.05

EXTREME INDIVIDUALS STUDY

misScores = myplsda2@scoreMN
varT = apply(misScores, 2, var)
miT2 = colSums(t(misScores**2) / varT)
N = nrow(X2)
A = 4
F95 = A*(N**2 - 1)/(N*(N - A)) * qf(0.95, A, N-A);
F99 = A*(N**2 - 1)/(N*(N - A)) * qf(0.99, A, N-A);

plot(1:length(miT2), miT2, type = "l", xlab = "Patients", ylab = "T2",
     main = "PLS: T2-Hotelling",ylim=c(0,20))
abline(h = F95, col = "orange", lty = 2, lwd = 2)
abline(h = F99, col = "red3", lty = 2, lwd = 2)

myT = myplsda2@scoreMN
myP = myplsda2@loadingMN
myE = scale(X2) - myT%*%t(myP) 
mySCR = rowSums(myE^2)   # SPE 
plot(1:length(mySCR), mySCR, type = "l", main = "SCR", 
     xlab = "Patients")

g = var(mySCR)/(2*mean(mySCR))
h = (2*mean(mySCR)^2)/var(mySCR)
chi2lim = g*qchisq(0.95, df = h)
abline(h = chi2lim, col = "orange", lty = 2)
chi2lim99 = g*qchisq(0.99, df = h)
abline(h = chi2lim99, col = "red3", lty = 2)

GRAPHICS

plot(x = myplsda2, typeVc = "x-score",
     parCexN = 0.8, parCompVi = c(1, 2), parPaletteVc = NA,
     parTitleL = TRUE, parCexMetricN = NA)

plot(x = myplsda2, typeVc = "xy-weight",
     parCexN = 0.7, parCompVi = c(1, 2), parPaletteVc = NA, 
     parTitleL = TRUE, parCexMetricN = NA)

VIP = data.frame(sort(myplsda2@vipVn))
bottom_10 <- data.frame(variables = rownames(VIP)[1:10], vip = VIP[1:10, ])
top_10 <- data.frame(variables = rownames(VIP)[46:55], vip = VIP[46:55, ])

grafico_barras <- ggplot(top_10, aes(x = reorder(variables, vip), y = vip)) + geom_bar(stat = "identity", fill = "skyblue") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  labs(x = "Variable", y = "VIP", title = "Top 10 VIP variables")

grafico_barras2 <- ggplot(bottom_10, aes(x = reorder(variables, vip), y = vip)) + geom_bar(stat = "identity", fill = "skyblue") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  labs(x = "Variable", y = "VIP", title = "Bottom 10 VIP variables")

print(grafico_barras)

print(grafico_barras2)

EVALUATION METRICS FOR CLASSIFICATION

mypred2 = predict(myplsda2)
actual = factor(df2$mejor_resp_num_ok)
mypred2 <- factor(mypred2, levels = levels(actual))
confusionMatrix(mypred2, actual, positive = "3")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2  3
##          0  0  0  0  0
##          1  4 17  3  0
##          2  0  0  3  0
##          3  1  0  1  5
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7353          
##                  95% CI : (0.5564, 0.8712)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.004521        
##                                           
##                   Kappa : 0.5578          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3
## Sensitivity            0.0000   1.0000  0.42857   1.0000
## Specificity            1.0000   0.5882  1.00000   0.9310
## Pos Pred Value            NaN   0.7083  1.00000   0.7143
## Neg Pred Value         0.8529   1.0000  0.87097   1.0000
## Prevalence             0.1471   0.5000  0.20588   0.1471
## Detection Rate         0.0000   0.5000  0.08824   0.1471
## Detection Prevalence   0.0000   0.7059  0.08824   0.2059
## Balanced Accuracy      0.5000   0.7941  0.71429   0.9655